1. Data Processing¶

1.1 Tweets¶

The dataset has no missing values.¶

Filter out tweets for Tesla¶

In [800]:
import pandas as pd
#Load file
file_path = "stock_tweets.csv"
data = pd.read_csv(file_path)
data.head(10)
Out[800]:
Date Tweet Stock Name Company Name
0 2022-09-29 23:41:16+00:00 Mainstream media has done an amazing job at br... TSLA Tesla, Inc.
1 2022-09-29 23:24:43+00:00 Tesla delivery estimates are at around 364k fr... TSLA Tesla, Inc.
2 2022-09-29 23:18:08+00:00 3/ Even if I include 63.0M unvested RSUs as of... TSLA Tesla, Inc.
3 2022-09-29 22:40:07+00:00 @RealDanODowd @WholeMarsBlog @Tesla Hahaha why... TSLA Tesla, Inc.
4 2022-09-29 22:27:05+00:00 @RealDanODowd @Tesla Stop trying to kill kids,... TSLA Tesla, Inc.
5 2022-09-29 22:25:53+00:00 @RealDanODowd @Tesla This is you https://t.co/... TSLA Tesla, Inc.
6 2022-09-29 22:24:22+00:00 For years @WholeMarsBlog viciously silenced @T... TSLA Tesla, Inc.
7 2022-09-29 22:23:54+00:00 $NIO just because I'm down money doesn't mean ... TSLA Tesla, Inc.
8 2022-09-29 22:23:28+00:00 50 likes for some $SPY $TSLA charts to study!\... TSLA Tesla, Inc.
9 2022-09-29 22:15:01+00:00 @MrJames__321 @KellyRoofing @TeslaSolar @elonm... TSLA Tesla, Inc.
In [22]:
#Check for shape
data.shape
Out[22]:
(80793, 4)
In [24]:
#Check for columns
column_names = data.columns.tolist()

print("Column names in the dataset:")
print(column_names)
Column names in the dataset:
['Date', 'Tweet', 'Stock Name', 'Company Name']
In [26]:
#Transform date time
data['Date'] = pd.to_datetime(data['Date'])
data['Date'] = data['Date'].dt.date
data.head()
Out[26]:
Date Tweet Stock Name Company Name
0 2022-09-29 Mainstream media has done an amazing job at br... TSLA Tesla, Inc.
1 2022-09-29 Tesla delivery estimates are at around 364k fr... TSLA Tesla, Inc.
2 2022-09-29 3/ Even if I include 63.0M unvested RSUs as of... TSLA Tesla, Inc.
3 2022-09-29 @RealDanODowd @WholeMarsBlog @Tesla Hahaha why... TSLA Tesla, Inc.
4 2022-09-29 @RealDanODowd @Tesla Stop trying to kill kids,... TSLA Tesla, Inc.
In [28]:
#Filter out tweets of Tesla 
tsla_data = data[data['Stock Name'] == 'TSLA']
#Drop out useless columns
tsla_data = tsla_data.drop(columns=['Company Name'])
tsla_data = tsla_data.drop(columns=['Stock Name'])
tsla_data.head()
Out[28]:
Date Tweet
0 2022-09-29 Mainstream media has done an amazing job at br...
1 2022-09-29 Tesla delivery estimates are at around 364k fr...
2 2022-09-29 3/ Even if I include 63.0M unvested RSUs as of...
3 2022-09-29 @RealDanODowd @WholeMarsBlog @Tesla Hahaha why...
4 2022-09-29 @RealDanODowd @Tesla Stop trying to kill kids,...
In [30]:
#Reverse the dataset
tsla_data = tsla_data.iloc[::-1]
tsla_data = tsla_data.reset_index(drop=True)
tsla_data.head()
Out[30]:
Date Tweet
0 2021-09-30 In other words, AMD has been giving Tesla pref...
1 2021-09-30 Get ready for a $TSLA _ _ _ _ _ _ Q3 delivery...
2 2021-09-30 Hold. On. Tight. $TSLA
3 2021-09-30 I agree with @freshjiva that $TSLA ‘s EV busin...
4 2021-09-30 Playing in the dirt and #chasingsunsets\n@tesl...
In [32]:
#Processed dataset
tsla_data.shape
Out[32]:
(37422, 2)
In [34]:
#Check for date
start_date = tsla_data['Date'].min()
end_date = tsla_data['Date'].max()
print(f"Start Date: {start_date}")
print(f"End Date: {end_date}")
Start Date: 2021-09-30
End Date: 2022-09-29
In [36]:
import matplotlib.pyplot as plt

#Visualize daily tweet count
daily_tweet_counts = tsla_data.groupby('Date').size().reset_index(name='Tweet Count')

print("Daily Tweet Counts:")
print(daily_tweet_counts)

#Visualize
plt.figure(figsize=(10, 6))
plt.plot(daily_tweet_counts['Date'], daily_tweet_counts['Tweet Count'], linestyle='-')
plt.title("Daily Tweet Counts")
plt.xlabel("Date")
plt.ylabel("Number of Tweets")
plt.grid(True)
plt.xticks(rotation=45) 
plt.tight_layout() 
plt.show()
Daily Tweet Counts:
           Date  Tweet Count
0    2021-09-30           90
1    2021-10-01           94
2    2021-10-02          116
3    2021-10-03           61
4    2021-10-04          119
..          ...          ...
360  2022-09-25           36
361  2022-09-26           72
362  2022-09-27           85
363  2022-09-28           75
364  2022-09-29          112

[365 rows x 2 columns]
No description has been provided for this image
In [43]:
#Save file
new_file_path = "TSLA_Tweets_data.csv"
tsla_data.to_csv(new_file_path, index=False)
In [45]:
#Save daily tweet count as csv
daily_tweet_counts = pd.DataFrame(daily_tweet_counts)
daily_tweet_counts.to_csv("daily_tweet_counts.csv", index = False)

1.2 Stock¶

No missing or duplicated data.¶

Filterd out stock of Tesla¶

In [47]:
#Load file for stock
file_path = "stock_yfinance_data.csv"

data = pd.read_csv(file_path)
data.head(5)
Out[47]:
Date Open High Low Close Adj Close Volume Stock Name
0 2021-09-30 260.333344 263.043335 258.333344 258.493347 258.493347 53868000 TSLA
1 2021-10-01 259.466675 260.260010 254.529999 258.406677 258.406677 51094200 TSLA
2 2021-10-04 265.500000 268.989990 258.706665 260.510010 260.510010 91449900 TSLA
3 2021-10-05 261.600006 265.769989 258.066681 260.196655 260.196655 55297800 TSLA
4 2021-10-06 258.733337 262.220001 257.739990 260.916656 260.916656 43898400 TSLA
In [49]:
#Check for shape
data.shape
Out[49]:
(6300, 8)
In [51]:
#Check for columns
column_names = data.columns.tolist()

print("Column names in the dataset:")
print(column_names)
Column names in the dataset:
['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'Stock Name']
In [53]:
#Check for all companies
data['Stock Name'].unique()
Out[53]:
array(['TSLA', 'MSFT', 'PG', 'META', 'AMZN', 'GOOG', 'AMD', 'AAPL',
       'NFLX', 'TSM', 'KO', 'F', 'COST', 'DIS', 'VZ', 'CRM', 'INTC', 'BA',
       'BX', 'NOC', 'PYPL', 'ENPH', 'NIO', 'ZS', 'XPEV'], dtype=object)
In [55]:
#Count of each stock name in the dataset
stock_counts = data['Stock Name'].value_counts()

print("Count of each stock name in the dataset:")
print(stock_counts)
Count of each stock name in the dataset:
Stock Name
TSLA    252
DIS     252
ZS      252
NIO     252
ENPH    252
PYPL    252
NOC     252
BX      252
BA      252
INTC    252
CRM     252
VZ      252
COST    252
MSFT    252
F       252
KO      252
TSM     252
NFLX    252
AAPL    252
AMD     252
GOOG    252
AMZN    252
META    252
PG      252
XPEV    252
Name: count, dtype: int64
In [57]:
#Filter out TSLA stock
tsla_stock_data = data[data['Stock Name'] == 'TSLA']
#Transfer date time data
tsla_stock_data['Date'] = pd.to_datetime(tsla_stock_data['Date'])
tsla_stock_data.head(5)
C:\Users\LJT19\AppData\Local\Temp\ipykernel_10904\4277104264.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tsla_stock_data['Date'] = pd.to_datetime(tsla_stock_data['Date'])
Out[57]:
Date Open High Low Close Adj Close Volume Stock Name
0 2021-09-30 260.333344 263.043335 258.333344 258.493347 258.493347 53868000 TSLA
1 2021-10-01 259.466675 260.260010 254.529999 258.406677 258.406677 51094200 TSLA
2 2021-10-04 265.500000 268.989990 258.706665 260.510010 260.510010 91449900 TSLA
3 2021-10-05 261.600006 265.769989 258.066681 260.196655 260.196655 55297800 TSLA
4 2021-10-06 258.733337 262.220001 257.739990 260.916656 260.916656 43898400 TSLA
In [61]:
## Check for duplicates
duplicates = tsla_stock_data[tsla_stock_data.duplicated(subset='Date', keep=False)]
if not duplicates.empty:
    print("Duplicates found:")
    print(duplicates)
else:
    print("No duplicates found.")

# Check for missing values
missing_values = tsla_stock_data.isnull().sum()
if missing_values.sum() > 0:
    print("Missing values found:")
    print(missing_values)
else:
    print("No missing values found.")
No duplicates found.
No missing values found.
In [63]:
#Check for date
start_date = tsla_stock_data['Date'].min()
end_date = tsla_stock_data['Date'].max()
print(f"Start Date: {start_date}")
print(f"End Date: {end_date}")
Start Date: 2021-09-30 00:00:00
End Date: 2022-09-29 00:00:00
In [67]:
#Visualization for Stock price over time
import matplotlib.dates as mdates
data_columns = {
    'Open': {'color': 'blue', 'linestyle': '--'},
    'Close': {'color': 'red', 'linestyle': '-'},
    'High': {'color': 'green', 'linestyle': '-.', 'linewidth': 1.5},
    'Low': {'color': 'orange', 'linestyle': ':', 'linewidth': 1.5}
}

plt.figure(figsize=(12, 8))

for column, style in data_columns.items():
    plt.plot(tsla_stock_data['Date'], tsla_stock_data[column], label=column, **style)

plt.title("Stock Price of Tesla Over Time", fontsize=16, fontweight='bold')
plt.xlabel("Date", fontsize=14)
plt.ylabel("Price", fontsize=14)
plt.legend(fontsize=12)
plt.grid(True, linestyle='--', alpha=0.7)

plt.gca().xaxis.set_major_locator(mdates.MonthLocator(interval=2))
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [69]:
#Save
#Delete Stock Name column
tsla_stock_data = tsla_stock_data.drop(columns=['Stock Name'])
new_file_path = "TSLA_stock_data.csv" 
tsla_stock_data.to_csv(new_file_path, index=False)
In [71]:
#Processed data info
tsla_data
Out[71]:
Date Tweet
0 2021-09-30 In other words, AMD has been giving Tesla pref...
1 2021-09-30 Get ready for a $TSLA _ _ _ _ _ _ Q3 delivery...
2 2021-09-30 Hold. On. Tight. $TSLA
3 2021-09-30 I agree with @freshjiva that $TSLA ‘s EV busin...
4 2021-09-30 Playing in the dirt and #chasingsunsets\n@tesl...
... ... ...
37417 2022-09-29 @RealDanODowd @Tesla Stop trying to kill kids,...
37418 2022-09-29 @RealDanODowd @WholeMarsBlog @Tesla Hahaha why...
37419 2022-09-29 3/ Even if I include 63.0M unvested RSUs as of...
37420 2022-09-29 Tesla delivery estimates are at around 364k fr...
37421 2022-09-29 Mainstream media has done an amazing job at br...

37422 rows × 2 columns

1.3 Stock Index¶

Nasdaq Index and S&P 500 Index found on the internet¶

Filter out data from 2021.9.30 to 2022.9.29¶

In [3]:
import pandas as pd

# Load file
file_path = 'HistoricalData_1742894586653.csv'
data = pd.read_csv(file_path)

# Convert the date column to datetime format
data['Date'] = pd.to_datetime(data['Date'], format='%m/%d/%Y')

# Filter data within the specified date range
start_date = '2021-09-30'
end_date = '2022-09-29'
filtered_data = data[(data['Date'] >= start_date) & (data['Date'] <= end_date)]

# Sort the filtered data by date in ascending order
filtered_data = filtered_data.sort_values(by='Date', ascending=True)

# Extract the closing price column and rename the column names
Nasdaq = filtered_data[['Date', 'Close/Open']]
Nasdaq.columns = ['Date', 'Nasdaq_Index']  # Rename column names
Nasdaq = Nasdaq.reset_index(drop=True)

# Print the result
print(Nasdaq)
Nasdaq.to_csv('Nasdaq_Index.csv', index=False)
          Date  Nasdaq_Index
0   2021-09-30      14689.62
1   2021-10-01      14791.87
2   2021-10-04      14472.12
3   2021-10-05      14674.15
4   2021-10-06      14766.75
..         ...           ...
247 2022-09-23      11311.24
248 2022-09-26      11254.11
249 2022-09-27      11271.75
250 2022-09-28      11493.83
251 2022-09-29      11164.78

[252 rows x 2 columns]
In [5]:
# Load file of sp500 index
file_path = 'sp500_index.csv'
data = pd.read_csv(file_path)
data
Out[5]:
Date S&P500 Nasdaq_Index
0 2021-09-30 4307.54 14689.62
1 2021-10-01 4357.04 14791.87
2 2021-10-04 4300.46 14472.12
3 2021-10-05 4345.72 14674.15
4 2021-10-06 4363.55 14766.75
... ... ... ...
247 2022-09-23 3693.23 11311.24
248 2022-09-26 3655.04 11254.11
249 2022-09-27 3647.29 11271.75
250 2022-09-28 3719.04 11493.83
251 2022-09-29 3640.47 11164.78

252 rows × 3 columns

In [7]:
#filter out date from 2021.9.30 to 2022.9.29
filtered_data = data[(data['Date'] >= start_date) & (data['Date'] <= end_date)]
In [9]:
# reset index
filtered_data = filtered_data.reset_index(drop=True)

# merge two index
filtered_data['Nasdaq_Index'] = Nasdaq['Nasdaq_Index']
filtered_data
Out[9]:
Date S&P500 Nasdaq_Index
0 2021-09-30 4307.54 14689.62
1 2021-10-01 4357.04 14791.87
2 2021-10-04 4300.46 14472.12
3 2021-10-05 4345.72 14674.15
4 2021-10-06 4363.55 14766.75
... ... ... ...
247 2022-09-23 3693.23 11311.24
248 2022-09-26 3655.04 11254.11
249 2022-09-27 3647.29 11271.75
250 2022-09-28 3719.04 11493.83
251 2022-09-29 3640.47 11164.78

252 rows × 3 columns

In [11]:
#Save file
filtered_data.to_csv('Index.csv', index=False)
In [13]:
# Visualization of two index over time
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from datetime import datetime
filtered_data['Date'] = pd.to_datetime(filtered_data['Date'])

filtered_data = filtered_data.sort_values(by='Date')
plt.figure(figsize=(10, 6))
plt.plot(filtered_data['Date'], filtered_data['Nasdaq_Index'], linestyle='-')
plt.title("Nasdaq Index Over Time")
plt.xlabel("Date")
plt.ylabel("Nasdaq Index")


plt.grid(True)
plt.gca().xaxis.set_major_locator(mdates.MonthLocator())
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [15]:
plt.figure(figsize=(10, 6))
plt.plot(filtered_data['Date'], filtered_data['S&P500'], color='red', linestyle='-')
plt.title("S&P 500 Index Over Time")
plt.xlabel("Date")
plt.ylabel("S&P 500 Index")
plt.grid(True)
plt.gca().xaxis.set_major_locator(mdates.MonthLocator())
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
No description has been provided for this image

2. Sentiment Analysis¶

2.1 Generate sentiment score¶

In [155]:
import pandas as pd
import numpy as np
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import unicodedata
In [157]:
tsla_data.head()
Out[157]:
Date Tweet
0 2021-09-30 In other words, AMD has been giving Tesla pref...
1 2021-09-30 Get ready for a $TSLA _ _ _ _ _ _ Q3 delivery...
2 2021-09-30 Hold. On. Tight. $TSLA
3 2021-09-30 I agree with @freshjiva that $TSLA ‘s EV busin...
4 2021-09-30 Playing in the dirt and #chasingsunsets\n@tesl...
In [159]:
# Add sentiment columns
sent_df = tsla_data.copy()
sent_df["sentiment_score"] = ''
sent_df["Positive"] = ''
sent_df["Neutral"] = ''
sent_df["Negative"] = ''
sent_df.head()
Out[159]:
Date Tweet sentiment_score Positive Neutral Negative
0 2021-09-30 In other words, AMD has been giving Tesla pref...
1 2021-09-30 Get ready for a $TSLA _ _ _ _ _ _ Q3 delivery...
2 2021-09-30 Hold. On. Tight. $TSLA
3 2021-09-30 I agree with @freshjiva that $TSLA ‘s EV busin...
4 2021-09-30 Playing in the dirt and #chasingsunsets\n@tesl...
In [161]:
%%time
# Generate Sentiment score by SentimentIntensityAnalyzer
sentiment_analyzer = SentimentIntensityAnalyzer()
for indx, row in sent_df.T.items():
    try:
        sentence_i = unicodedata.normalize('NFKD', sent_df.loc[indx, 'Tweet'])
        sentence_sentiment = sentiment_analyzer.polarity_scores(sentence_i)
        sent_df.at[indx, 'sentiment_score'] = sentence_sentiment['compound']
        sent_df.at[indx, 'Negative'] = sentence_sentiment['neg']
        sent_df.at[indx, 'Neutral'] = sentence_sentiment['neu']
        sent_df.at[indx, 'Positive'] = sentence_sentiment['pos']
    except TypeError:
        print (sent_df.loc[indexx, 'Tweet'])
        print (indx)
        break
CPU times: total: 8.52 s
Wall time: 8.54 s
In [162]:
# Check for result
sent_df.head()
Out[162]:
Date Tweet sentiment_score Positive Neutral Negative
0 2021-09-30 In other words, AMD has been giving Tesla pref... 0.659 0.166 0.834 0.0
1 2021-09-30 Get ready for a $TSLA _ _ _ _ _ _ Q3 delivery... 0.4215 0.257 0.743 0.0
2 2021-09-30 Hold. On. Tight. $TSLA 0.0 0.0 1.0 0.0
3 2021-09-30 I agree with @freshjiva that $TSLA ‘s EV busin... 0.5719 0.175 0.747 0.078
4 2021-09-30 Playing in the dirt and #chasingsunsets\n@tesl... -0.1531 0.148 0.656 0.197
In [163]:
# Convert the 'Date' column to datetime format
sent_df['Date'] = pd.to_datetime(sent_df['Date'])

# Extract only the date part from the datetime column (remove time information)
sent_df['Date'] = sent_df['Date'].dt.date
#sent_df = sent_df.drop(columns=['Negative', 'Positive', 'Neutral', 'Stock Name', 'Company Name'])
sent_df.head()
Out[163]:
Date Tweet sentiment_score Positive Neutral Negative
0 2021-09-30 In other words, AMD has been giving Tesla pref... 0.659 0.166 0.834 0.0
1 2021-09-30 Get ready for a $TSLA _ _ _ _ _ _ Q3 delivery... 0.4215 0.257 0.743 0.0
2 2021-09-30 Hold. On. Tight. $TSLA 0.0 0.0 1.0 0.0
3 2021-09-30 I agree with @freshjiva that $TSLA ‘s EV busin... 0.5719 0.175 0.747 0.078
4 2021-09-30 Playing in the dirt and #chasingsunsets\n@tesl... -0.1531 0.148 0.656 0.197
In [164]:
# Save file
sent_df.to_csv('Tweet_sentiment_score.csv', encoding='utf-8',index=False)
In [165]:
sent_df.groupby('Date')['sentiment_score'].mean().head(20)
Out[165]:
Date
2021-09-30    0.231552
2021-10-01    0.233704
2021-10-02     0.27194
2021-10-03     0.27157
2021-10-04    0.135388
2021-10-05    0.069445
2021-10-06     0.19994
2021-10-07    0.192548
2021-10-08    0.220011
2021-10-09    0.294931
2021-10-10    0.244551
2021-10-11    0.185286
2021-10-12    0.191255
2021-10-13    0.160097
2021-10-14    0.158425
2021-10-15    0.080318
2021-10-16    0.220176
2021-10-17    0.215528
2021-10-18    0.219143
2021-10-19    0.155566
Name: sentiment_score, dtype: object
In [166]:
# Calculate average of sentiment score
twitter_df = sent_df.groupby('Date')['sentiment_score'].mean()

print(twitter_df.shape)
# Save file
twitter_df.to_csv('Tweet_sentiment_score(every day).csv', encoding='utf-8',index=True)
(365,)
In [167]:
sent_df['Date'] = pd.to_datetime(sent_df['Date'])
##weekly_sentiment_trade = sent_df.groupby(pd.Grouper(key='Date', freq='W-FRI'))['sentiment_score'].mean() #freq='W-FRI'
weekly_sentiment_trade = sent_df.groupby(pd.Grouper(key='Date', freq='W'))['sentiment_score'].mean()
weekly_sentiment_trade.head()
# Average sentiment score on a weekly basis (up to Sunday).
#weekly_sentiment_trade.to_csv('Tweet_sentiment_score(every week).csv', encoding='utf-8',index=True)
Out[167]:
Date
2021-10-03    0.251852
2021-10-10    0.188605
2021-10-17    0.166151
2021-10-24    0.184919
2021-10-31    0.196194
Freq: W-SUN, Name: sentiment_score, dtype: object
In [173]:
monthly_sentiment = sent_df.groupby(pd.Grouper(key='Date', freq='ME'))['sentiment_score'].mean()

monthly_sentiment.head()
#The average sentiment score on a monthly basis (up to the end of the month).
#monthly_sentiment.to_csv('Tweet_sentiment_score(every month).csv', encoding='utf-8',index=True) 
Out[173]:
Date
2021-09-30    0.231552
2021-10-31    0.191146
2021-11-30    0.191373
2021-12-31    0.188472
2022-01-31     0.14224
Freq: ME, Name: sentiment_score, dtype: object
In [177]:
#categorize sentiment scores into 'positive', 'negative', or 'neutral'
sent_df = sent_df.drop(columns=['Negative', 'Positive', 'Neutral'])
def categorize_sentiment(score):
    if score >= 0.5:
        return 'positive'
    elif score <= -0.5:
        return 'negative'
    else:
        return 'neutral'

sent_df['sentiment'] = sent_df['sentiment_score'].apply(categorize_sentiment)

sent_df.head()
Out[177]:
Date Tweet sentiment_score sentiment
0 2021-09-30 In other words, AMD has been giving Tesla pref... 0.659 positive
1 2021-09-30 Get ready for a $TSLA _ _ _ _ _ _ Q3 delivery... 0.4215 neutral
2 2021-09-30 Hold. On. Tight. $TSLA 0.0 neutral
3 2021-09-30 I agree with @freshjiva that $TSLA ‘s EV busin... 0.5719 positive
4 2021-09-30 Playing in the dirt and #chasingsunsets\n@tesl... -0.1531 neutral
In [181]:
# Categorize sentiment scores into numerical values:
def categorize_sentiment2(score):
    if score >= 0.4:
        return 1
    elif score <= -0.4:
        return 0
    else:
        return 0.5

sent_df['sentiment'] = sent_df['sentiment_score'].apply(categorize_sentiment2)

sent_df.head()
Out[181]:
Date Tweet sentiment_score sentiment
0 2021-09-30 In other words, AMD has been giving Tesla pref... 0.659 1.0
1 2021-09-30 Get ready for a $TSLA _ _ _ _ _ _ Q3 delivery... 0.4215 1.0
2 2021-09-30 Hold. On. Tight. $TSLA 0.0 0.5
3 2021-09-30 I agree with @freshjiva that $TSLA ‘s EV busin... 0.5719 1.0
4 2021-09-30 Playing in the dirt and #chasingsunsets\n@tesl... -0.1531 0.5
In [185]:
#sent_df = sent_df.drop(columns=['Negative', 'Positive', 'Neutral'])
#sent_df.head()
In [183]:
# Save the file (sentiment scores of all tweets)

#sent_df.to_csv('Tweet_sentiment_score_category.csv', encoding='utf-8',index=False)

2.2 LSTM for sentiment analysis¶

The following is modified from the tutorial content¶

In [195]:
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt') # Used for sentence tokenizer
nltk.download('stopwords')
nltk.download('punkt_tab')
import string
from nltk.tokenize import word_tokenize #nltk: natural language toolkit
from nltk.corpus import stopwords
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\LJT19\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LJT19\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\LJT19\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
In [197]:
lstm_df = sent_df.copy()
# Preprocessing text data 
review_lines = list()
lines = lstm_df['Tweet'].values.tolist()
for line in lines:
    tokens = word_tokenize(line)
    # convert to lower case
    tokens = [w.lower() for w in tokens]
    # remove punctuation from each word
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    words = [word for word in stripped if word.isalpha()] 
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if not w in stop_words]
    review_lines.append(words)
In [198]:
# check the number of reviews and the data frame shape
len(review_lines)
lstm_df.shape
Out[198]:
(37422, 4)
In [199]:
import gensim

EMBEDDING_DIM = 100

# Train Word2Vec model
model = gensim.models.Word2Vec(sentences=review_lines, vector_size=EMBEDDING_DIM, window=5, min_count=1)

# Get vocabulary size
words = list(model.wv.index_to_key)  # Updated method for getting vocab
print('Vocabulary size: %d' % len(words))
Vocabulary size: 30515
In [200]:
print(model.wv['root'])
print(len(model.wv['root']))
[-0.02639537  0.04859987  0.02323065  0.01520963  0.02069867 -0.06345103
  0.00237939  0.10403991 -0.02221519 -0.01791887 -0.00426699 -0.06221278
 -0.00984189  0.01198509  0.02840637 -0.03173357  0.02984535 -0.02477416
 -0.01590225 -0.06706116  0.01942845  0.01528488  0.01606948 -0.01145657
 -0.00077515 -0.00854264 -0.00818149 -0.01306987 -0.03472029  0.00400812
  0.0404654   0.00851348  0.01482215 -0.04922578 -0.01271665  0.05398022
  0.03191518 -0.0242642  -0.0316348  -0.00399408 -0.00492199 -0.02592164
 -0.01037458  0.00309789  0.03754494  0.00466243 -0.04174955 -0.02683674
  0.03998186  0.01295525  0.02872487 -0.03530835 -0.04714225 -0.0012458
 -0.02732489 -0.00957228  0.03153846 -0.004599   -0.02782652  0.00469119
 -0.00240873  0.01199539  0.01057674 -0.03354356 -0.03389709  0.00507666
  0.01660249  0.01945366 -0.04706703  0.01470122  0.00744584  0.03220437
  0.04197431  0.00299182  0.02270749  0.02610776  0.00666063 -0.00461335
 -0.02119539  0.00996126 -0.03195732  0.00079824  0.00183682  0.0245774
 -0.00269652  0.00316148  0.00894271  0.02376296  0.04649055  0.0071784
  0.0124121   0.01912157  0.01785881 -0.01374316  0.05322729  0.02678355
  0.02244523 -0.03449194 -0.01159479 -0.00490714]
100
In [201]:
# save model in ASCII (word2vec) format
filename = 'group_project_embedding_word2vec.txt'
model.wv.save_word2vec_format(filename, binary=False) 
In [202]:
# Finding out similar words
model.wv.most_similar('tesla', topn =5)
model.wv.similar_by_word("car")
Out[202]:
[('ever', 0.9052832722663879),
 ('miles', 0.9046816229820251),
 ('guzzling', 0.9017306566238403),
 ('teslas', 0.9010254144668579),
 ('safest', 0.8957082033157349),
 ('carriers', 0.8872512578964233),
 ('drive', 0.8776369094848633),
 ('world', 0.8759986758232117),
 ('fastest', 0.8705390691757202),
 ('autonomous', 0.8698485493659973)]
In [213]:
# Find words related to "positive" (to measure positive sentiment)
print(model.wv.most_similar('positive', topn=10))

# Find words related to "negative" (to measure negative sentiment)
print(model.wv.most_similar('negative', topn=10))
[('continue', 0.9677192568778992), ('fundamentals', 0.9598056077957153), ('fall', 0.9556040167808533), ('ridiculously', 0.9524328112602234), ('cause', 0.9514209032058716), ('raise', 0.9509127736091614), ('reflect', 0.9463772773742676), ('assuming', 0.9457159638404846), ('given', 0.9450112581253052), ('peaked', 0.9441218972206116)]
[('worry', 0.9699762463569641), ('portfolios', 0.969102144241333), ('strategy', 0.968085765838623), ('cover', 0.9671775698661804), ('cushion', 0.9660492539405823), ('pain', 0.9642733335494995), ('institutions', 0.9641199707984924), ('fear', 0.9635016918182373), ('fundamentally', 0.963267982006073), ('confidence', 0.9623303413391113)]
In [215]:
# Word that best satisfies analogy relations
# Example usage of analogy (commented out):
# model.wv.most_similar_cosmul(positive=['woman', 'king'], negative=['man'])
# model.wv.most_similar(positive=['woman', 'king'], negative=['man'])

# Analyze analogy relationships
# For example, "elon" is to "tesla" as "tim_cook" is to what?
# Here, we are trying to find a word that completes the analogy:
# "boss" is to "tesla" as "elon" is to what?
model.wv.most_similar(positive=['boss', 'tesla'], negative=['elon'], topn=5)
Out[215]:
[('beautiful', 0.853935718536377),
 ('stargazerplaid', 0.8470975160598755),
 ('gigaworkshop', 0.8423700928688049),
 ('tesmaniancom', 0.8418003916740417),
 ('snowblower', 0.8417527079582214)]
In [217]:
# Finding the odd word out
print(model.wv.doesnt_match("tesla autopilot software battery recall".split()))
print(model.wv.doesnt_match("elonmusk ceo innovation problem success".split()))
tesla
elonmusk
In [219]:
# Similarity between words
#print(model.wv.similarity('cat', 'movie'))
In [221]:
# Suppress excessive printing and load embeddings efficiently
embeddings_index = {}

with open('group_project_embedding_word2vec.txt', 'r', encoding='utf-8') as f:
    for i, line in enumerate(f):
        values = line.split()
        word = values[0]  # First value is the word
        coefs = np.asarray(values[1:], dtype='float32')  # Remaining values are embeddings
        embeddings_index[word] = coefs

print(f"Loaded {len(embeddings_index)} word vectors.")
Loaded 30516 word vectors.
In [223]:
#Let's check the vector for word "men"
embeddings_index['tesla']
Out[223]:
array([-0.71105003,  0.87905407, -0.46093175,  0.2798589 ,  0.4994845 ,
       -1.2032409 , -0.347253  ,  2.1046436 ,  0.04646157, -0.64542085,
       -0.7883267 , -0.55146027, -0.04696903,  0.62486345, -0.13276118,
       -0.3724636 ,  1.2867846 ,  0.6873424 , -0.07962274, -1.2975892 ,
       -0.09589551, -0.18267868,  0.44770238, -1.6075722 ,  1.4333322 ,
       -0.6432317 , -0.28130063,  0.16019958, -0.25701803,  0.18099813,
        0.56565833, -0.21602117, -1.2137904 , -1.0170438 , -0.12016281,
        1.2887949 ,  0.18248892, -0.5955466 ,  0.0161146 , -0.3073855 ,
        0.29642892,  0.06306926, -0.51192147, -0.43064877,  0.6596848 ,
        1.2730328 , -1.4054806 , -1.1332502 ,  0.661447  ,  0.3076126 ,
        0.9193487 , -0.6246483 , -1.6771576 ,  0.51781726, -0.02376858,
       -0.58212614,  0.05695917, -0.00985799, -0.8893998 , -1.0885311 ,
        0.23131801,  0.8802511 ,  0.03330267, -0.66670716, -0.2246818 ,
        0.29192406,  1.3226191 , -1.0850828 , -0.8439994 ,  0.17294247,
       -0.25068647,  1.3726964 ,  0.51728827,  0.04200741, -0.01277055,
        0.49540782,  0.49373797, -0.71228194, -0.9718147 ,  1.1175257 ,
        0.3502359 , -0.15345189, -0.33879098,  0.14311174, -0.15162936,
        0.48832226, -1.1551956 , -0.74478936,  0.6889191 , -0.22727433,
        0.52805185,  0.74544823,  0.60426563, -0.6128503 ,  1.7455885 ,
        0.39142498,  1.1795077 , -0.7301047 ,  0.39732546,  0.08327533],
      dtype=float32)
In [235]:
import torch
from torch.nn.utils.rnn import pad_sequence
from nltk.tokenize import word_tokenize

VALIDATION_SPLIT = 0.2

# Use NLTK tokenizer
tokenizer_obj = word_tokenize

# Tokenize the text and convert words to indices
vocab = {}  # Dictionary to store word indices
word_index = {}
index = 1  # Start indexing from 1 (0 is reserved for padding)

# Build vocabulary from the dataset
tokenized_texts = []
for line in review_lines:
    if isinstance(line, list):  
        line = ' '.join(line)  # Convert list to string if necessary
    tokens = tokenizer_obj(line)  # Tokenize using NLTK
    tokenized_texts.append(tokens)

    for token in tokens:
        if token not in vocab:
            vocab[token] = index
            word_index[token] = index 
            index += 1

# Convert tokens to numerical sequences
sequences = [[vocab[token] for token in tokens] for tokens in tokenized_texts]

# Padding sequences
sequences = [torch.tensor(seq, dtype=torch.long) for seq in sequences]
sequences = pad_sequence(sequences, batch_first=True, padding_value=0)

print("Vocabulary size:", len(vocab))
print("Sample tokenized sequence:", sequences[0])
Vocabulary size: 30515
Sample tokenized sequence: tensor([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])
In [237]:
max_length = 100  # Maximum length of sentences

# Pad sequences
print('Found %s unique tokens.' % len(vocab))

# Convert to tensor and pad
review_pad = pad_sequence(sequences, batch_first=True, padding_value=0)  # Padding with 0
review_pad = review_pad[:, :max_length]  # Ensure max_length constraint

sentiment = torch.tensor(lstm_df['sentiment'].values, dtype=torch.long)

print('Shape of review tensor:', review_pad.shape)
print('Shape of sentiment tensor:', sentiment.shape)
Found 30515 unique tokens.
Shape of review tensor: torch.Size([37422, 49])
Shape of sentiment tensor: torch.Size([37422])
In [241]:
review_pad[2000]
Out[241]:
tensor([ 603,  225,  104, 1250,  179, 2544, 1540, 6054, 6055,  465, 6056,   14,
        4982,  875,   15, 6057,   15,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0])
In [243]:
# Split the dataset into training and validation

# Create indices using torch.arange
indices = torch.arange(sequences.shape[0])

print("Original indices:", indices)

# Apply the shuffled indices to sequences and sentiment tensors
sequences = sequences[indices]
sentiment_tensor = sentiment[indices]

# Calculate the number of validation samples
num_validation_samples = int(VALIDATION_SPLIT * sequences.shape[0])

print("Number of validation samples:", num_validation_samples)
Original indices: tensor([    0,     1,     2,  ..., 37419, 37420, 37421])
Number of validation samples: 7484
In [245]:
# Get the number of samples
num_samples = len(review_pad)

# Generate shuffled indices
indices = np.arange(num_samples)
np.random.seed(42)  # Set random seed for reproducibility
np.random.shuffle(indices)

# Apply shuffled indices to split data
X_shuffled = review_pad[indices]
y_shuffled = sentiment[indices]

# Perform split
num_validation_samples = int(0.2 * num_samples)  # Assuming 80-20 train-test split
X_train_pad = X_shuffled[:-num_validation_samples]
y_train = y_shuffled[:-num_validation_samples]
X_test_pad = X_shuffled[-num_validation_samples:]
y_test = y_shuffled[-num_validation_samples:]
In [247]:
print('Shape of Training dataset X:', X_train_pad.shape)
print('Shape of Training dataset Y:', y_train.shape)
print('Shape of Validation dataset X:', X_test_pad.shape)
print('Shape of Valodation daatset Y:', y_test.shape)
Shape of Training dataset X: torch.Size([29938, 49])
Shape of Training dataset Y: torch.Size([29938])
Shape of Validation dataset X: torch.Size([7484, 49])
Shape of Valodation daatset Y: torch.Size([7484])
In [249]:
# Define embedding dimensions
EMBEDDING_DIM = 100

# Define model hyperparameters
max_length = 100

# Number of words in vocabulary (+1 for padding index)
num_words = len(word_index) + 1

# Initialize the embedding matrix as a tensor filled with zeros
embedding_matrix = torch.zeros((num_words, EMBEDDING_DIM))

# Populate embedding matrix with pre-trained embeddings
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    
    if embedding_vector is not None:
        # Convert embedding_vector to a PyTorch tensor and assign it
        embedding_matrix[i] = torch.tensor(embedding_vector, dtype=torch.float32)
In [251]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Hyperparameters
batch_size = 128
epochs = 15

X_train_tensor = X_train_pad.clone().detach().to(dtype=torch.long)
y_train_tensor = y_train.clone().detach().to(dtype=torch.float32)
X_test_tensor = X_test_pad.clone().detach().to(dtype=torch.long)
y_test_tensor = y_test.clone().detach().to(dtype=torch.float32)

# Create DataLoader for training and validation
train_data = TensorDataset(X_train_tensor, y_train_tensor)
test_data = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=True)

# Define the Model
class SentimentModel(nn.Module):
    def __init__(self, num_words, EMBEDDING_DIM, embedding_matrix, max_length):
        super(SentimentModel, self).__init__()

        self.embedding_layer = nn.Embedding.from_pretrained(embedding_matrix, freeze=True, padding_idx=0)

        # Single-layer LSTM with dropout
        self.lstm_layer = nn.LSTM(
            input_size=EMBEDDING_DIM, 
            hidden_size=32,
            num_layers=1,  
            batch_first=True,
            dropout=0,  # Dropout is ignored for single-layer LSTMs
        )

        # Fully connected layer
        self.fc_layer = nn.Linear(32, 1)

        # Activation function
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        embedded = self.embedding_layer(x)
        lstm_out, _ = self.lstm_layer(embedded)  # LSTM returns (output, (h_n, c_n))
        lstm_out = lstm_out[:, -1, :]  # Take last LSTM output
        logits = self.fc_layer(lstm_out)
        return self.sigmoid(logits).squeeze()  # Ensure output is a tensor

# Example: Instantiating the model
model = SentimentModel(num_words, EMBEDDING_DIM, embedding_matrix, max_length)

# Binary Cross Entropy Loss
criterion = nn.BCELoss()

# Adam Optimizer with L2 Regularization
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Print model summary
print(model)
SentimentModel(
  (embedding_layer): Embedding(30516, 100, padding_idx=0)
  (lstm_layer): LSTM(100, 32, batch_first=True)
  (fc_layer): Linear(in_features=32, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)
In [253]:
print("Train...")
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct = 0
    total = 0

    for batch in train_loader:
        X_batch, y_batch = batch
        y_batch = y_batch

        # Forward pass
        predictions = model(X_batch)

        # Compute loss
        loss = criterion(predictions, y_batch)
        total_loss += loss.item()

        # Compute accuracy
        predicted_labels = (predictions >= 0.5).float()
        correct += (predicted_labels == y_batch).sum().item()
        total += y_batch.size(0)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    train_loss = total_loss / len(train_loader)
    train_accuracy = correct / total

    # Validation step
    with torch.no_grad():  
        model.eval()
        val_loss = 0
        val_correct = 0
        val_total = 0

        for batch in test_loader:
            X_batch, y_batch = batch
            y_batch = y_batch
            
            predictions = model(X_batch)

            loss = criterion(predictions, y_batch)
            val_loss += loss.item()

            predicted_labels = (predictions >= 0.5).float()
            val_correct += (predicted_labels == y_batch).sum().item()
            val_total += y_batch.size(0)

        val_loss /= len(test_loader)
        val_accuracy = val_correct / val_total

    print(f"Epoch {epoch+1}/{epochs} - loss: {train_loss:.4f} - accuracy: {train_accuracy:.4f} "
          f"- val_loss: {val_loss:.4f} - val_accuracy: {val_accuracy:.4f}")

print("Training complete.")

# Save the trained LSTM model
torch.save(model, "sentiment_model_full.pth")
print("Model saved")
Train...
Epoch 1/15 - loss: 0.6354 - accuracy: 0.6437 - val_loss: 0.6046 - val_accuracy: 0.6704
Epoch 2/15 - loss: 0.5834 - accuracy: 0.6973 - val_loss: 0.5639 - val_accuracy: 0.7096
Epoch 3/15 - loss: 0.5528 - accuracy: 0.7254 - val_loss: 0.5481 - val_accuracy: 0.7250
Epoch 4/15 - loss: 0.5369 - accuracy: 0.7378 - val_loss: 0.5398 - val_accuracy: 0.7306
Epoch 5/15 - loss: 0.5253 - accuracy: 0.7457 - val_loss: 0.5266 - val_accuracy: 0.7402
Epoch 6/15 - loss: 0.5194 - accuracy: 0.7494 - val_loss: 0.5175 - val_accuracy: 0.7476
Epoch 7/15 - loss: 0.5114 - accuracy: 0.7562 - val_loss: 0.5145 - val_accuracy: 0.7528
Epoch 8/15 - loss: 0.5069 - accuracy: 0.7583 - val_loss: 0.5242 - val_accuracy: 0.7432
Epoch 9/15 - loss: 0.5012 - accuracy: 0.7630 - val_loss: 0.5101 - val_accuracy: 0.7608
Epoch 10/15 - loss: 0.4968 - accuracy: 0.7646 - val_loss: 0.4995 - val_accuracy: 0.7614
Epoch 11/15 - loss: 0.4908 - accuracy: 0.7706 - val_loss: 0.5061 - val_accuracy: 0.7624
Epoch 12/15 - loss: 0.4861 - accuracy: 0.7718 - val_loss: 0.5018 - val_accuracy: 0.7660
Epoch 13/15 - loss: 0.4909 - accuracy: 0.7697 - val_loss: 0.5011 - val_accuracy: 0.7659
Epoch 14/15 - loss: 0.4831 - accuracy: 0.7773 - val_loss: 0.4996 - val_accuracy: 0.7660
Epoch 15/15 - loss: 0.4776 - accuracy: 0.7773 - val_loss: 0.4998 - val_accuracy: 0.7631
Training complete.
Model saved
In [254]:
# Ensure the model is in evaluation mode
model.eval()

# Initialize test loss and accuracy
correct = 0
total = 0
test_loss = 0.0
criterion = nn.BCELoss()

print("Testing...")

# Disable gradient computation for evaluation
with torch.no_grad():
    for batch_idx, (inputs, labels) in enumerate(test_loader):
        outputs = model(inputs)

        # Compute loss
        loss = criterion(outputs.squeeze(), labels.float())
        test_loss += loss.item()

        # Compute accuracy
        predicted_labels = (outputs.squeeze() >= 0.5).float()
        correct += (predicted_labels == labels).sum().item()
        total += labels.size(0)

        # Print progress bar with loss and accuracy
        print(f"\r{batch_idx+1}/{len(test_loader)} [{'=' * (batch_idx % 20)}] "
              f"- loss: {test_loss / (batch_idx+1):.4f} - accuracy: {correct/total:.4f}", end='')

# Compute final test loss and accuracy
test_loss /= len(test_loader)
accuracy = correct / total
Testing...
59/59 [==================] - loss: 0.4996 - accuracy: 0.76317
In [255]:
model.eval()

# Initialize lists to store true labels and predictions
true_labels = []
predictions = []

# Disable gradient calculation for evaluation
with torch.no_grad():
    for batch in test_loader:
        X_batch, y_batch = batch
        
        # Move data to the same device as the model
        X_batch = X_batch.to(next(model.parameters()).device)
        y_batch = y_batch.numpy()  # Convert true labels to NumPy for evaluation
        true_labels.extend(y_batch)

        # Get model predictions
        y_pred = model(X_batch)
        predicted_labels = (y_pred >= 0.5).float().cpu().numpy()  # Convert to binary labels
        predictions.extend(predicted_labels)

# Convert lists to NumPy arrays
true_labels = np.array(true_labels)
predictions = np.array(predictions)

# Generate Confusion Matrix
conf_matrix = confusion_matrix(true_labels, predictions)

# Plot the Confusion Matrix
plt.figure(figsize=(6,5))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=["Negative", "Positive"], yticklabels=["Negative", "Positive"])
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Confusion Matrix")
plt.show()
No description has been provided for this image
In [256]:
# Generate Classification Report
class_report = classification_report(true_labels, predictions, target_names=["Negative", "Positive"])
print("Classification Report:\n", class_report)
Classification Report:
               precision    recall  f1-score   support

    Negative       0.75      0.96      0.84      4913
    Positive       0.85      0.38      0.52      2571

    accuracy                           0.76      7484
   macro avg       0.80      0.67      0.68      7484
weighted avg       0.78      0.76      0.73      7484

In [266]:
model.eval()

# Get first 5 samples from the test set
first_5_samples = next(iter(test_loader))  # Fetches one batch
X_batch, y_batch = first_5_samples

# Move input data to the same device as the model
X_batch = X_batch.to(next(model.parameters()).device)

# Get predictions
with torch.no_grad():
    y_pred = model(X_batch)

# Convert predictions to binary labels
predicted_labels = (y_pred >= 0.5).float().cpu().numpy()

# Convert true labels to numpy array
true_labels = y_batch.cpu().numpy()

print("First 5 Predictions vs True Labels")
for i in range(5):
    print(f"Observation {i+1}: Prediction = {predicted_labels[i]}, True Label = {true_labels[i]}")
First 5 Predictions vs True Labels
Observation 1: Prediction = 1.0, True Label = 1.0
Observation 2: Prediction = 0.0, True Label = 1.0
Observation 3: Prediction = 0.0, True Label = 0.0
Observation 4: Prediction = 0.0, True Label = 0.0
Observation 5: Prediction = 0.0, True Label = 0.0

Testing¶

In [268]:
import torch
import numpy as np
import gensim
from torch.nn.utils.rnn import pad_sequence

# Load trained Word2Vec embeddings
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format('group_project_embedding_word2vec.txt', binary=False)

# Define preprocessing function to convert text into sequences
def text_to_sequence(text, word2vec_model, max_length):
    words = text.lower().split()
    sequence = [word2vec_model.index_to_key.index(word) if word in word2vec_model.index_to_key else 0 for word in words]
    return sequence[:max_length]  # Trim to max_length

# Define max_length
max_length = 100  # Adjust based on training

# new reviews
new_reviews = [
    "Not to my taste, will skip and watch another movie",
    "good movie!"
]

# Convert reviews to sequences
new_sequences = [text_to_sequence(review, word2vec_model, max_length) for review in new_reviews]

# Convert to tensor and pad sequences
new_sequences_tensor = [torch.tensor(seq) for seq in new_sequences]
new_sequences_tensor = pad_sequence(new_sequences_tensor, batch_first=True)

# Load full trained model
model = torch.load("sentiment_model_full.pth", weights_only=False)
model.eval()

# Perform predictions
with torch.no_grad():
    predictions = model(new_sequences_tensor)

# Convert predictions to binary labels
predicted_labels = ["Positive" if pred > 0.6 else ("Negative" if pred <= 0.4 else "Neutral")for pred in predictions]

for review, label in zip(new_reviews, predicted_labels): 
    print(f"Review: {review}\nPredicted Sentiment: {label}\n")
Review: Not to my taste, will skip and watch another movie
Predicted Sentiment: Neutral

Review: good movie!
Predicted Sentiment: Negative

3. Time Series Analysis¶

In [288]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
In [290]:
# Load file
tsl_stk = pd.read_csv("TSLA_stock_data.csv")
tsl_stk['Date'] = pd.to_datetime(tsl_stk['Date'])
tsl_stk.head(5)
Out[290]:
Date Open High Low Close Adj Close Volume
0 2021-09-30 260.333344 263.043335 258.333344 258.493347 258.493347 53868000
1 2021-10-01 259.466675 260.260010 254.529999 258.406677 258.406677 51094200
2 2021-10-04 265.500000 268.989990 258.706665 260.510010 260.510010 91449900
3 2021-10-05 261.600006 265.769989 258.066681 260.196655 260.196655 55297800
4 2021-10-06 258.733337 262.220001 257.739990 260.916656 260.916656 43898400
In [292]:
# Check csv
tsl_stk.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252 entries, 0 to 251
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Date       252 non-null    datetime64[ns]
 1   Open       252 non-null    float64       
 2   High       252 non-null    float64       
 3   Low        252 non-null    float64       
 4   Close      252 non-null    float64       
 5   Adj Close  252 non-null    float64       
 6   Volume     252 non-null    int64         
dtypes: datetime64[ns](1), float64(5), int64(1)
memory usage: 13.9 KB
In [294]:
tsl_twt = pd.read_csv("TSLA_tweets_data.csv")
tsl_twt['Date'] = pd.to_datetime(tsl_twt['Date']) 
tsl_twt.head()
Out[294]:
Date Tweet
0 2021-09-30 In other words, AMD has been giving Tesla pref...
1 2021-09-30 Get ready for a $TSLA _ _ _ _ _ _ Q3 delivery...
2 2021-09-30 Hold. On. Tight. $TSLA
3 2021-09-30 I agree with @freshjiva that $TSLA ‘s EV busin...
4 2021-09-30 Playing in the dirt and #chasingsunsets\n@tesl...
In [296]:
df = tsl_stk.copy()

3.1 Data Processing¶

In [314]:
#sentiment-score
sentiment_score_t = pd.read_csv('Tweet_sentiment_score.csv')
sentiment_score_d = pd.read_csv('Tweet_sentiment_score(every day).csv')
In [316]:
sentiment_score_t.head(3)
Out[316]:
Date Tweet sentiment_score Positive Neutral Negative
0 2021-09-30 In other words, AMD has been giving Tesla pref... 0.6590 0.166 0.834 0.0
1 2021-09-30 Get ready for a $TSLA _ _ _ _ _ _ Q3 delivery... 0.4215 0.257 0.743 0.0
2 2021-09-30 Hold. On. Tight. $TSLA 0.0000 0.000 1.000 0.0
In [318]:
sentiment_score_d.head(3)
Out[318]:
Date sentiment_score
0 2021-09-30 0.231552
1 2021-10-01 0.233704
2 2021-10-02 0.271940
In [320]:
sentiment_score_d['Date'] = pd.to_datetime(sentiment_score_d['Date'])
sentiment_score_d.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 365 entries, 0 to 364
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Date             365 non-null    datetime64[ns]
 1   sentiment_score  365 non-null    float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 5.8 KB
In [322]:
# The weekend results will be carried forward to the next trading day, 
# combining the scores of Saturday and Sunday into Monday
sc = sentiment_score_d.copy()
sc.set_index('Date', inplace=True)  

# Calculate the scores for Monday
weekends = sc[sc.index.weekday >= 5]   
mondays = sc[sc.index.weekday == 0]  
ave_sentiment = weekends.resample('W-Mon').mean() 
print(ave_sentiment.head())
mondays = mondays.add(ave_sentiment['sentiment_score'])/2  

# Update the sentiment score DataFrame by removing Saturday and Sunday data
sc = sc[sc.index.weekday < 5]  
sc.update(mondays)  

sc.reset_index(inplace=True)  
sc.head(10)
            sentiment_score
Date                       
2021-10-04         0.271755
2021-10-11         0.269741
2021-10-18         0.217852
2021-10-25         0.243395
2021-11-01         0.307372
Out[322]:
Date sentiment_score
0 2021-09-30 0.231552
1 2021-10-01 0.233704
2 2021-10-04 0.135388
3 2021-10-05 0.069445
4 2021-10-06 0.199940
5 2021-10-07 0.192548
6 2021-10-08 0.220011
7 2021-10-11 0.185286
8 2021-10-12 0.191255
9 2021-10-13 0.160097
In [324]:
# Distribution of sentiment scores
plt.figure(figsize=(6,4))
sns.histplot(sc['sentiment_score'], bins=50, kde=True, color='purple')
plt.title("Distribution of Sentiment Scores")
plt.xlabel("Sentiment")
plt.ylabel("Frequency")
plt.grid(color='gray', linestyle='--', linewidth=0.5, alpha=0.7)
plt.show()

plt.figure(figsize=(10,5))
plt.plot(sc['Date'], sc['sentiment_score'], label="Sentiment Score", color='green')
plt.axhline(y=0, color='red', linestyle='--', label="Neutral Sentiment")
plt.title("Sentiment Scores Over Time")
plt.xlabel("Date")
plt.ylabel("Sentiment Score")
plt.legend()
plt.grid()
plt.show()
plt.close('all')
No description has been provided for this image
No description has been provided for this image
In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

df = pd.read_csv('stock_yfinance_data.csv')

df_tsla = df[df['Stock Name'] == 'TSLA']

df_tsla = df_tsla[['Open', 'Close', 'High', 'Low', 'Volume']]

corr_matrix = df_tsla.corr()

plt.figure(figsize=(6, 5))  
sns.heatmap(
    corr_matrix,           
    annot=True,            
    cmap='coolwarm',        
    vmin=-1, vmax=1,        
    square=True            
)

plt.title('Correlation Matrix for TSLA')  
plt.show()
No description has been provided for this image
In [326]:
# Plot of stock prices and sentiment scores over time
from sklearn.preprocessing import StandardScaler  

stk,stt = df.copy(),sc.copy()
# Normalize the data
scaler_close = StandardScaler()  
stk['Close_normalized'] = scaler_close.fit_transform(stk[['Close']])  

scaler_sentiment = StandardScaler()  
stt['normalized'] = scaler_sentiment.fit_transform(stt[['sentiment_score']])  

plt.figure(figsize=(18, 6))
plt.plot(stk['Date'], stk['Close_normalized'], label="Close Price", color='blue',linewidth = 1.5)
plt.plot(stt['Date'], stt['normalized'], label="Sentiment Score", color='red',linewidth = 1.5,linestyle = '-',alpha = 0.4)
plt.title("TSLA Stock Prices and Sentiment Score Over Time")
plt.xlabel("Date")
plt.grid(True)
plt.legend()
plt.show()
No description has been provided for this image
In [328]:
# Merge into one dataset
merged_data = pd.merge(df, sc, on='Date', how='left')  #df stockdata/sc sentimentscore data
merged_data.head()
Out[328]:
Date Open High Low Close Adj Close Volume sentiment_score
0 2021-09-30 260.333344 263.043335 258.333344 258.493347 258.493347 53868000 0.231552
1 2021-10-01 259.466675 260.260010 254.529999 258.406677 258.406677 51094200 0.233704
2 2021-10-04 265.500000 268.989990 258.706665 260.510010 260.510010 91449900 0.135388
3 2021-10-05 261.600006 265.769989 258.066681 260.196655 260.196655 55297800 0.069445
4 2021-10-06 258.733337 262.220001 257.739990 260.916656 260.916656 43898400 0.199940
In [342]:
# Save data
merged_data.to_csv('merged_data.csv',index = False)

3.2 ACF/PACF¶

In [336]:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf  
# Plot autocorrelation and partial autocorrelation functions
def acf(variable):
    fig, axes = plt.subplots(1,2,figsize = (13,4))
    plot_acf(tsl_stk[variable], lags=40, ax = axes[0])
    axes[0].set_title(f'Autocorrelation of {variable}')
    plot_pacf(tsl_stk[variable], lags=40, ax = axes[1])
    axes[1].set_title(f'Partial Autocorrelation of {variable}')
    plt.show()
In [338]:
tsl_stk.columns.values
Out[338]:
array(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'],
      dtype=object)
In [340]:
for var in list(tsl_stk.columns.values)[1:]:
    acf(var)
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

3.3 Time Series Prediction¶

In [1]:
# Split the dataset by window
def univariate_data(dataset, start_index, end_index, history_size, target_size):
    data = []
    labels = []
    
    start_index = start_index + history_size
    if end_index is None:
        end_index = len(dataset) - target_size

    for i in range(start_index, end_index):
        indices = range(i - history_size, i)
        # Convert dataset to a PyTorch tensor
        dataset_tensor = torch.tensor(dataset, dtype=torch.float32)
        data.append(dataset_tensor[indices].unsqueeze(-1))  # Adding extra dimension
        
        if target_size == 0:
            labels.append(dataset_tensor[i + target_size])
        else:
            labels.append(dataset_tensor[i : i + target_size])
        
    return torch.stack(data), torch.stack(labels)

3.3.1 Feature Engineering¶

In [2]:
import pandas as pd
#Load dataset
merged_data = pd.read_csv('merged_data.csv')
In [3]:
# Add S&P500 AND Nasadaq Composite Index
index = pd.read_csv('index.csv')
merged_data2 = pd.merge(merged_data,index,on = 'Date',how = 'left')
merged_data2.head()
Out[3]:
Date Open High Low Close Adj Close Volume sentiment_score S&P500 Nasdaq_Index
0 2021-09-30 260.333344 263.043335 258.333344 258.493347 258.493347 53868000 0.231552 4307.54 14689.62
1 2021-10-01 259.466675 260.260010 254.529999 258.406677 258.406677 51094200 0.233704 4357.04 14791.87
2 2021-10-04 265.500000 268.989990 258.706665 260.510010 260.510010 91449900 0.135388 4300.46 14472.12
3 2021-10-05 261.600006 265.769989 258.066681 260.196655 260.196655 55297800 0.069445 4345.72 14674.15
4 2021-10-06 258.733337 262.220001 257.739990 260.916656 260.916656 43898400 0.199940 4363.55 14766.75

Explore data information

In [4]:
df = merged_data2
df['Daily_Return(close)'] = df['Close'].pct_change()  
df['Daily_Return(S&P500)'] = df['S&P500'].pct_change()  
df['Daily_Return(Nasdaq_Index)'] = df['Nasdaq_Index'].pct_change()  

annual_volatility1 = df['Daily_Return(close)'].std() * (252 ** 0.5) * 100  
annual_volatility2 = df['Daily_Return(S&P500)'].std() * (252 ** 0.5) * 100  
annual_volatility3 = df['Daily_Return(Nasdaq_Index)'].std() * (252 ** 0.5) * 100  
print("annual_volatility: ")
print(f"Close:{annual_volatility1:.2f}%")  
print(f"S&P500:{annual_volatility2:.2f}%")  
print(f"Nasdaq_Index:{annual_volatility3:.2f}%")  
 
annual_volatility: 
Close:64.46%
S&P500:21.82%
Nasdaq_Index:29.76%
In [5]:
df['Date'] = pd.to_datetime(df['Date'])
In [6]:
import pandas_ta as pta  
import matplotlib.pyplot as plt 
from matplotlib.dates import MonthLocator, DateFormatter 
plt.close('all')
 
df.ta.adx(high='High', low='Low', close='Close', length=14, append=True) 

df.rename(columns={'ADX_14': 'ADX'}, inplace=True)  

df['SMA20'] = df['Close'].rolling(window=20).mean()  
df['SMA50'] = df['Close'].rolling(window=50).mean()  
df['MA_Divergence'] = abs(df['SMA20'] - df['SMA50']) / df['SMA50'] * 100  
 
plt.figure(figsize=(14, 6))  
plt.plot(df['Date'], df['ADX'], label='ADX ',lw = 1.5,color = 'purple')  
plt.axhline(y=25, color='r', linestyle='--', alpha=0.6,label = 'Strong Trend Threshold (25)') 
plt.axhline(y=20, color='g', linestyle='--', alpha=0.6,label = 'Weak Trend Threshold(20)')

plt.title("TSLA Stock Trend Strength (ADX)")  
plt.xlabel("Date")  
plt.ylabel("ADX Value") 
locator = MonthLocator(interval=1) 
formatter = DateFormatter('%Y-%m')  
plt.gca().xaxis.set_major_locator(locator)  
plt.gca().xaxis.set_major_formatter(formatter) 

plt.legend()  
plt.grid(True, which='major',alpha = 0.5)
plt.show()  
plt.close('all')
No description has been provided for this image

add extra feature

In [8]:
import seaborn as sns  
import matplotlib.pyplot as plt  

# Create correlation matrix  
correlation = df[['Close', 'sentiment_score', 'S&P500', 'Nasdaq_Index']].corr()  

# Plot using seaborn  
plt.figure(figsize=(5,5))  
heatmap = sns.heatmap(correlation, annot=True, fmt=".2f", cmap="RdBu_r",   
                     vmin=-1, vmax=1, cbar_kws={'label': 'Correlation'})  
plt.title("Correlation Matrix with Improved Features")  
plt.tight_layout()  
plt.show()  
plt.close('all')
No description has been provided for this image
In [10]:
df1 = merged_data2.copy()

# Add features related to closing prices and sentiment scores
# Calculate future return lag features, which refer to the rate of change in stock prices at future time points
for lag in [1,3,7,15]:  #acf:The autocorrelation is significant for lags ranging from 0 to 15.
    df1[f'future_returns_lag_{lag}'] = df1['Close'].shift(-lag) / df1['Close'] - 1

# The volatility and lag characteristics of sentiment scores
df1['sentiment_volatility'] = df1['sentiment_score'].rolling(window=3).std()
df1['lagged_sentiment'] = df1['sentiment_score'].shift(1)  #窗口1


# Plot the Correlation Matrix with Improved Features
correlation = df1[['sentiment_score', 'sentiment_volatility', 'lagged_sentiment',
                   'future_returns_lag_1', 'future_returns_lag_3','future_returns_lag_7', 'future_returns_lag_15','S&P500','Nasdaq_Index']].corr()
'''
import plotly.express as px
fig = px.imshow(
    correlation,
    text_auto=".2f",
    color_continuous_scale=px.colors.sequential.RdBu,
    labels=dict(color="Correlation"),
    title="Correlation Matrix with Improved Features"
)
fig.show()
'''
# Plot using seaborn  
plt.figure(figsize=(8,8))  
heatmap = sns.heatmap(correlation, annot=True, fmt=".2f", cmap="RdBu_r",   
                     vmin=-1, vmax=1, cbar_kws={'label': 'Correlation'})  
plt.title("Correlation Matrix with Improved Features")  
plt.tight_layout()  
plt.show()  
plt.close('all')
No description has been provided for this image
In [11]:
df1.fillna(0,inplace = True)
df1.head()
Out[11]:
Date Open High Low Close Adj Close Volume sentiment_score S&P500 Nasdaq_Index ... DMN_14 SMA20 SMA50 MA_Divergence future_returns_lag_1 future_returns_lag_3 future_returns_lag_7 future_returns_lag_15 sentiment_volatility lagged_sentiment
0 2021-09-30 260.333344 263.043335 258.333344 258.493347 258.493347 53868000 0.231552 4307.54 14689.62 ... 0.0 0.0 0.0 0.0 -0.000335 0.006589 0.021226 0.152834 0.000000 0.000000
1 2021-10-01 259.466675 260.260010 254.529999 258.406677 258.406677 51094200 0.233704 4357.04 14791.87 ... 0.0 0.0 0.0 0.0 0.008140 0.009713 0.039344 0.173447 0.000000 0.231552
2 2021-10-04 265.500000 268.989990 258.706665 260.510010 260.510010 91449900 0.135388 4300.46 14472.12 ... 0.0 0.0 0.0 0.0 -0.001203 0.015457 0.037810 0.311351 0.056152 0.233704
3 2021-10-05 261.600006 265.769989 258.066681 260.196655 260.196655 55297800 0.069445 4345.72 14674.15 ... 0.0 0.0 0.0 0.0 0.002767 0.006277 0.048335 0.304693 0.082659 0.135388
4 2021-10-06 258.733337 262.220001 257.739990 260.916656 260.916656 43898400 0.199940 4363.55 14766.75 ... 0.0 0.0 0.0 0.0 0.013874 0.011741 0.077011 0.325915 0.065248 0.069445

5 rows × 25 columns

3.3.2 Model Design¶

In [12]:
import matplotlib.pyplot as plt
import numpy as np
In [13]:
model_data = df1.copy()
In [240]:
# Chosen features
import torch  
import torch.nn as nn 

#Taking different features into consideration to show different performance
features_considered = ['Close'
                      # ,'sentiment_score'#]  
                      # , 'sentiment_volatility'
                      # , 'lagged_sentiment'
                       , 'future_returns_lag_1'
                       , 'future_returns_lag_3'
                       , 'future_returns_lag_7'
                       , 'future_returns_lag_15'
                       #, 'S&P500','Nasdaq_Index'
                      ]
features_considered_sentiment = ['Close','sentiment_score']
features_considered_sentiment_index = ['Close','sentiment_score','S&P500','Nasdaq_Index']
features_considered_close = ['Close']
features = model_data[features_considered]  # Feature data
features.index = model_data['Date']  
features.head()
Out[240]:
Close future_returns_lag_1 future_returns_lag_3 future_returns_lag_7 future_returns_lag_15
Date
2021-09-30 258.493347 -0.000335 0.006589 0.021226 0.152834
2021-10-01 258.406677 0.008140 0.009713 0.039344 0.173447
2021-10-04 260.510010 -0.001203 0.015457 0.037810 0.311351
2021-10-05 260.196655 0.002767 0.006277 0.048335 0.304693
2021-10-06 260.916656 0.013874 0.011741 0.077011 0.325915
In [241]:
# Visualization
features.plot(subplots = True,figsize = (7,7))
Out[241]:
array([<Axes: xlabel='Date'>, <Axes: xlabel='Date'>,
       <Axes: xlabel='Date'>, <Axes: xlabel='Date'>,
       <Axes: xlabel='Date'>], dtype=object)
No description has been provided for this image
In [242]:
# Convert the selected features to a NumPy array
dataset = features.values 
TRAIN_SPLIT = int(len(dataset) * 0.7)  # 70% as training dataset  
# Calculate the mean and standard deviation of the training split along each feature dimension
data_mean = torch.tensor(dataset[:TRAIN_SPLIT].mean(axis=0), dtype=torch.float32)  
data_std = torch.tensor(dataset[:TRAIN_SPLIT].std(axis=0), dtype=torch.float32)  
# Normalize the dataset using the calculated mean and standard deviation
dataset = (torch.tensor(dataset, dtype=torch.float32) - data_mean) / data_std  
In [243]:
# Generates training samples and labels from a multivariate time series dataset.
def multivariate_data(dataset, target, start_index, end_index, history_size, 
                      target_size, step, single_step=False):
    data = []
    labels = []
    
    start_index = start_index + history_size
   
    if end_index is None:
        end_index = len(dataset) - target_size

    for i in range(start_index, end_index):
        
        indices = range(i - history_size, i, step)
        
        dataset_tensor = dataset if isinstance(dataset, torch.Tensor) else torch.tensor(dataset, dtype=torch.float32)
        target_tensor = target if isinstance(target, torch.Tensor) else torch.tensor(target, dtype=torch.float32)

        # Append the input data (history) from this window
        data.append(dataset_tensor[indices])

        # Append the target data
        if single_step:
            labels.append(target_tensor[i + target_size])
        else:
            labels.append(target_tensor[i:i + target_size])

    return torch.stack(data), torch.stack(labels)
In [244]:
# Create training and validation datasets
# Define the history size and target size
FUTURE_STEP = 15
past_history = 35
future_target = FUTURE_STEP  
STEP = 1
# Create training and validation datasets for the multi-step model 
x_train_multi, y_train_multi = multivariate_data(
    dataset, dataset[:, 0], 0, TRAIN_SPLIT, past_history, future_target, STEP
)

x_val_multi, y_val_multi = multivariate_data(
    dataset, dataset[:, 0], TRAIN_SPLIT, None, past_history, future_target, STEP
)


print(f"x_train_multi shape: {x_train_multi.shape}, y_train_multi shape: {y_train_multi.shape}")  
print(f"x_val_multi shape: {x_val_multi.shape}, y_val_multi shape: {y_val_multi.shape}")  
x_train_multi shape: torch.Size([141, 35, 5]), y_train_multi shape: torch.Size([141, 15])
x_val_multi shape: torch.Size([26, 35, 5]), y_val_multi shape: torch.Size([26, 15])
In [245]:
#loader
from torch.utils.data import DataLoader, TensorDataset  

# Define the batch size
BATCH_SIZE = 100
# Create TensorDatasets for the training and validation data
train_data_multi = TensorDataset(x_train_multi, y_train_multi)
val_data_multi = TensorDataset(x_val_multi, y_val_multi)

# Create DataLoaders for batching and shuffling
train_data_loader_multi = DataLoader(train_data_multi, batch_size=BATCH_SIZE, shuffle=True)
val_data_loader_multi = DataLoader(val_data_multi, batch_size=BATCH_SIZE, shuffle=False)
In [246]:
dataset
Out[246]:
tensor([[-1.1726, -0.0056,  0.0846,  0.1794,  0.8712],
        [-1.1745,  0.1909,  0.1263,  0.3360,  0.9891],
        [-1.1287, -0.0257,  0.2029,  0.3228,  1.7776],
        ...,
        [-0.6404,  0.4012, -0.0032, -0.0041, -0.0026],
        [-0.5343, -1.5768, -0.0032, -0.0041, -0.0026],
        [-0.9611,  0.0021, -0.0032, -0.0041, -0.0026]])

3.3.3 Moving Average (Baseline)¶

In [26]:
#split the dataset by window
def univariate_data(dataset, start_index, end_index, history_size, target_size):
    data = []
    labels = []
    
    start_index = start_index + history_size
    if end_index is None:
        end_index = len(dataset) - target_size

    for i in range(start_index, end_index):
        indices = range(i - history_size, i)
        # Convert dataset to a PyTorch tensor
        dataset_tensor = torch.tensor(dataset, dtype=torch.float32)
        data.append(dataset_tensor[indices].unsqueeze(-1))  # Adding extra dimension
        
        if target_size == 0:
            labels.append(dataset_tensor[i + target_size])
        else:
            labels.append(dataset_tensor[i : i + target_size])
        
    return torch.stack(data), torch.stack(labels)
In [27]:
#define baseline testing and training set
uni_data = dataset[:,0]
print('uni_data shape:',uni_data.shape)
HISTORY_SIZE = 35  # Number of past time steps
TARGET_SIZE = 0    # Predict the next step

x_train_uni, y_train_uni = univariate_data(uni_data, 0, TRAIN_SPLIT, HISTORY_SIZE, TARGET_SIZE)
x_val_uni, y_val_uni = univariate_data(uni_data, TRAIN_SPLIT, None, HISTORY_SIZE, TARGET_SIZE)
print(f"x_train_uni shape: {x_train_uni.shape}")
print(f"y_train_uni shape: {y_train_uni.shape}")
print(f"x_val_uni shape: {x_val_uni.shape}")
print(f"y_val_uni shape: {y_val_uni.shape}")
uni_data shape: torch.Size([252])
x_train_uni shape: torch.Size([141, 35, 1])
y_train_uni shape: torch.Size([141])
x_val_uni shape: torch.Size([41, 35, 1])
y_val_uni shape: torch.Size([41])
C:\Users\aa139\AppData\Local\Temp\ipykernel_55528\4179888868.py:13: UserWarning:

To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).

In [28]:
# Function to create time steps for plotting
def create_time_steps(length):
    time_steps = []
    for i in range(-length, 0, 1):
        time_steps.append(i)
    return time_steps

def show_plot(plot_data, delta, title):
    labels = ['History', 'True Future', 'Model Prediction']
    marker = ['.-', 'rx', 'go']
    time_steps = create_time_steps(plot_data[0].shape[0])

    if delta:
        future = list(range(0,delta,1))
    else: #single step
        future = 0
        
    plt.figure(figsize=(8,5))
    plt.title(title)
    for i, x in enumerate(plot_data):
        if i:#true future data
            plt.plot(future, plot_data[i], marker[i], markersize=6, label=labels[i])
        else:#history data
            plt.plot(time_steps, plot_data[i].flatten(), marker[i], label=labels[i])
    plt.legend()
    plt.grid(alpha = 0.3)
    plt.xlim([time_steps[0] - 1, (future[-1] + 2)])
    plt.xlabel('Time-Step')
    return plt

One Step Baseline¶

In [3]:
# Baseline function to calculate the mean of the history
def baseline(history):
    return torch.mean(history).item()

# Generate a baseline prediction
example_data = x_train_uni[0]  # Single example (tensor)
example_label = y_train_uni[0]  # Corresponding label (tensor)

# Compute the baseline prediction
baseline_prediction = baseline(example_data)

# Show the plot using the previously defined show_plot function
show_plot(
    [example_data.numpy(), example_label.numpy(), np.array(baseline_prediction)],
    delta=0,
    title="Baseline Prediction"
).show()
No description has been provided for this image
In [30]:
def multi_step_baseline(history, window_size, target_size):  

    predictions = []  
    current_window = history.squeeze().numpy().copy()  # [35]
    
    for _ in range(target_size):   
        pred = np.mean(current_window[-window_size:])
        predictions.append(pred)  
        
        current_window = np.append(current_window[1:], pred)  
        
    return predictions  
In [31]:
# Function to create time steps for plotting
def create_time_steps(length):
    time_steps = []
    for i in range(-length, 0, 1):
        time_steps.append(i)
    return time_steps

def show_plot(plot_data, delta, title):
    labels = ['History', 'True Future', 'Model Prediction']
    marker = ['.-', 'rx', 'go']
    time_steps = create_time_steps(plot_data[0].shape[0])

    if delta:
        future = list(range(0,delta,1))
    else: #single step
        future = 0
        
    plt.figure(figsize=(8,5))
    plt.title(title)
    for i, x in enumerate(plot_data):
        if i:#true future data
            plt.plot(future, plot_data[i], marker[i], markersize=6, label=labels[i])
        else:#history data
            plt.plot(time_steps, plot_data[i].flatten(), marker[i], label=labels[i])
    plt.legend()
    plt.grid(alpha = 0.3)
    plt.xlim([time_steps[0] - 1, (future[-1] + 2)])
    plt.xlabel('Time-Step')
    return plt
In [33]:
uni_data = dataset[:,0]
print('uni_data shape:',uni_data.shape)
HISTORY_SIZE = 35  # Number of past time steps
TARGET_SIZE = 15 

x_train_uni, y_train_uni = univariate_data(uni_data, 0, TRAIN_SPLIT, HISTORY_SIZE, TARGET_SIZE)
x_val_uni, y_val_uni = univariate_data(uni_data, TRAIN_SPLIT, None, HISTORY_SIZE, TARGET_SIZE)
print(f"x_train_uni shape: {x_train_uni.shape}")
print(f"y_train_uni shape: {y_train_uni.shape}")
print(f"x_val_uni shape: {x_val_uni.shape}")
print(f"y_val_uni shape: {y_val_uni.shape}")
uni_data shape: torch.Size([252])
x_train_uni shape: torch.Size([141, 35, 1])
y_train_uni shape: torch.Size([141, 15])
x_val_uni shape: torch.Size([26, 35, 1])
y_val_uni shape: torch.Size([26, 15])
C:\Users\aa139\AppData\Local\Temp\ipykernel_55528\4179888868.py:13: UserWarning:

To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).

Multi Step Baseline¶

In [34]:
# Generate a baseline prediction
example_data = x_train_uni[45]  # Single example (tensor)
example_label = y_train_uni[45]  # Corresponding label (tensor)

BASELINE_WINDOW_SIZE = 15
# Compute the baseline prediction
baseline_prediction = multi_step_baseline(example_data,BASELINE_WINDOW_SIZE,TARGET_SIZE)

plt.show()
# Show the plot using the previously defined show_plot function
show_plot(
    [example_data.numpy(), example_label.numpy(), np.array(baseline_prediction)],
    delta= TARGET_SIZE,
    title="MA-Baseline Prediction"
).show()
No description has been provided for this image
In [2]:
from sklearn.metrics import mean_absolute_error, mean_squared_error  

def evaluate_regression(y_true, y_pred):  

    metrics = {  
        'MAE': mean_absolute_error(y_true, y_pred),  
        'RMSE': np.sqrt(mean_squared_error(y_true, y_pred)),  
        'MAPE': np.mean(np.abs((y_true - y_pred) / y_true)) * 100  
    }  
    return metrics  
 
y_true = example_label.numpy() 
y_pred = np.array(baseline_prediction)
print(evaluate_regression(y_true, y_pred))
#pd.DataFrame([evaluate_regression(y_true, y_pred)])
{'MAE': 0.257088303565979, 'RMSE': 0.32571183386762115, 'MAPE': 34.09973382949829}
In [3]:
BASELINE_WINDOW_SIZE = 15
mae = []
mse = []
rmse = []
for i in range(x_train_uni.shape[0]):
    example_data = x_train_uni[i]  # Single example (tensor)
    example_label = y_train_uni[i]  # Corresponding label (tensor)
    # Compute the baseline prediction
    baseline_prediction = multi_step_baseline(example_data,BASELINE_WINDOW_SIZE,TARGET_SIZE)
    y_true = example_label.numpy() 
    y_pred = np.array(baseline_prediction)
    
    mae.append(mean_absolute_error(y_true, y_pred))
    mse.append(mean_squared_error(y_true, y_pred))
    rmse.append(np.sqrt(mean_squared_error(y_true, y_pred)))
print(f'rmse:{np.mean(rmse)}  mse:{np.mean(mse)}  mae:{np.mean(mae)}')
rmse:0.7912054789702202  mse:0.7531945594183519  mae:0.6948035638383094

3.3.4 LSTM Model¶

In [89]:
#define loss function
def rmae_loss(predictions, targets):  
    absolute_errors = torch.abs(predictions - targets)   
    mean_absolute_error = torch.mean(absolute_errors)   
    root_mean_absolute_error = torch.sqrt(mean_absolute_error)  
    return root_mean_absolute_error 
loss_fn2 = rmae_loss
In [265]:
import torch  
import torch.nn as nn  

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_size = x_train_multi.shape[2]  # Number of input features  
hidden_size1 = 64 
hidden_size2 = 32  
output_size = FUTURE_STEP  # Number of future values to predict  

# Define a multi-layer LSTM model  
class LSTMModel(nn.Module):  
    def __init__(self):  
        super(LSTMModel, self).__init__()  
        self.lstm1 = nn.LSTM(input_size, hidden_size1, batch_first=True)  
        self.lstm2 = nn.LSTM(hidden_size1, hidden_size2, num_layers=2, dropout=0.4, batch_first=True)  
        self.dense = nn.Linear(hidden_size2, output_size)  

    def forward(self, x):  
        lstm_out, _ = self.lstm1(x)  
        lstm_out, _ = self.lstm2(lstm_out)  
        output = self.dense(lstm_out[:, -1, :])  # Take the output from the last time step  
        return output  

# Instantiate the model  
model = LSTMModel().to(device)
In [266]:
# Define the Optimizer
optimizer = torch.optim.RMSprop(
    list(model.lstm1.parameters()) + list(model.lstm2.parameters()) + list(model.dense.parameters()), 
    lr=0.001, alpha=0.9
)
loss_fn = nn.L1Loss()  # Use MAE as loss function

print(f"LSTM1: {model.lstm1}")
print(f"LSTM2: {model.lstm2}")
print(f"Dense: {model.dense}")
LSTM1: LSTM(5, 64, batch_first=True)
LSTM2: LSTM(64, 32, num_layers=2, batch_first=True, dropout=0.4)
Dense: Linear(in_features=32, out_features=15, bias=True)
In [267]:
# Training model
EPOCHS = 50
EVALUATION_INTERVAL = 100
validation_steps = 50
# Training loop
best_val_loss = 0 
patience = 10  
no_improve = 0 
for epoch in range(EPOCHS): 
    print(f"Epoch {epoch + 1}/{EPOCHS}")
    model.train()  

    train_loss = 0.0
    for step, (x_batch, y_batch) in enumerate(train_data_loader_multi):  
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)  
        
        # Forward pass  
        predictions = model(x_batch)  
        
        # Calculate loss  
        loss_mae = loss_fn(predictions, y_batch)
        loss_rmae = loss_fn2(predictions, y_batch)
        loss = loss_mae + loss_rmae
        # Backward pass  
        optimizer.zero_grad()  
        loss.backward()  
        optimizer.step()  
        
        # Accumulate training loss
        train_loss += loss.item()
        # Limit steps per epoch
        if step + 1 == EVALUATION_INTERVAL:
            break
   
    # Average training loss
    train_loss /= step + 1
    
    val_loss = 0
    model.eval()
    with torch.no_grad():
        for step, (x_batch, y_batch) in enumerate(val_data_loader_multi):  
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)  
            if step + 1 == validation_steps:
                break
            # Forward pass  
            predictions = model(x_batch)  

            # Calculate loss  
            loss_mae = loss_fn(predictions, y_batch)
            loss_rmae = loss_fn2(predictions, y_batch)
            loss = loss_mae + loss_rmae  

            # Accumulate validation loss
            val_loss += loss.item()

    # Average validation loss
    val_loss /= step + 1
          
    if val_loss < best_val_loss:  
        best_val_loss = val_loss  
        no_improve = 0  
    else:  
        no_improve += 1  
        if no_improve >= patience:  
            print("Early stopping")  
            break  
    print(f"MAE+RMAE - train_loss: {train_loss:.4f} - val_loss: {val_loss:.4f}")
Epoch 1/50
MAE+RMAE - train_loss: 1.7899 - val_loss: 1.1859
Epoch 2/50
MAE+RMAE - train_loss: 1.7711 - val_loss: 1.1449
Epoch 3/50
MAE+RMAE - train_loss: 1.7751 - val_loss: 1.0828
Epoch 4/50
MAE+RMAE - train_loss: 1.6671 - val_loss: 0.9896
Epoch 5/50
MAE+RMAE - train_loss: 1.6392 - val_loss: 0.9145
Epoch 6/50
MAE+RMAE - train_loss: 1.4807 - val_loss: 0.9812
Epoch 7/50
MAE+RMAE - train_loss: 1.4657 - val_loss: 1.0193
Epoch 8/50
MAE+RMAE - train_loss: 1.4024 - val_loss: 1.0488
Epoch 9/50
MAE+RMAE - train_loss: 1.3466 - val_loss: 1.1033
Epoch 10/50
Early stopping

Model Prediction Plot¶

In [95]:
# Function to create time steps for plotting
import matplotlib.pyplot as plt
import numpy as np
def create_time_steps(length):
    return list(range(-length, 0))

# Function for multi-step plotting
def multi_step_plot(history, true_future, prediction, title="Multi-Step Prediction"):
    plt.figure(figsize=(8, 4))
    num_in = create_time_steps(len(history))
    num_out = len(true_future)

    # Plot the history
    plt.plot(num_in, history[:, 0], label="History")  # Assuming index 1 corresponds to the target feature
    # Plot the true future
    plt.plot(np.arange(num_out), true_future, 'bo', label="True Future")
    # Plot the predicted future
    if prediction is not None:
        plt.plot(np.arange(num_out), prediction, 'ro', label="Predicted Future")

    plt.legend(loc="upper left")
    plt.title(title)
    plt.grid(color = 'gray',linestyle = '--',alpha = 0.7)
    plt.ylim(-3,0.5)
    plt.show()

    model.eval()

LSTM_1: Consider close prices:¶

features_considered = ['Close']

In [24]:
#Testing
model.eval
all_predictions = []
all_targets = []
# Iterate through the validation dataset
with torch.no_grad():  # Disable gradient calculation for evaluation
    for i, (x_batch, y__batch) in enumerate(val_data_loader_multi): 
        x = x_batch.to(device)
        y = y_batch.to(device)

        # Perform forward pass
        predictions = model(x_batch)
        
        all_predictions.append(predictions.cpu())
        all_targets.append(y_batch.cpu())
        # Print the shape of predictions
        print(predictions.shape)
        break
        
all_predictions = torch.cat(all_predictions, dim=0)
all_targets = torch.cat(all_targets, dim=0)

# Calculating MSE, RMSE, MAE
mse = torch.mean((all_predictions - all_targets) ** 2)
rmse = torch.sqrt(mse)
mae = torch.mean(torch.abs(all_predictions - all_targets))

print(f"Test RMSE: {rmse.item():.4f}")
print(f"Test MSE: {mse.item():.4f}")
print(f"Test MAE: {mae.item():.4f}")
torch.Size([26, 15])
Test RMSE: 0.4381
Test MSE: 0.1919
Test MAE: 0.3649
In [25]:
# Multi Features
with torch.no_grad():  # Disable gradient calculation for evaluation
    for i, (x, y) in enumerate(val_data_loader_multi): 

        x, y = x.to(device), y.to(device)

        # Perform forward pass
        predictions = model(x)  # Use the output of the last time step
        for j in range(10):
            history = x[j].cpu().numpy()  # First sample in the batch
            true_future = y[j].cpu().numpy()  # Corresponding ground truth
            predicted_future = predictions[j].cpu().numpy()  # Model's prediction

            multi_step_plot(history, true_future, predicted_future, title=f"Sample {j + 1}")
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

LSTM_2: Consider close prices and future_return:¶

features_considered = ['Close', 'future_returns_lag_1' , 'future_returns_lag_3' , 'future_returns_lag_7' , 'future_returns_lag_15']

In [268]:
#Testing
model.eval
all_predictions = []
all_targets = []
# Iterate through the validation dataset
with torch.no_grad():  # Disable gradient calculation for evaluation
    for i, (x_batch, y__batch) in enumerate(val_data_loader_multi): 
        x = x_batch.to(device)
        y = y_batch.to(device)

        # Perform forward pass
        predictions = model(x_batch)
        
        all_predictions.append(predictions.cpu())
        all_targets.append(y_batch.cpu())
        # Print the shape of predictions
        print(predictions.shape)
        break
        
all_predictions = torch.cat(all_predictions, dim=0)
all_targets = torch.cat(all_targets, dim=0)

# Calculating MSE, RMSE, MAE
mse = torch.mean((all_predictions - all_targets) ** 2)
rmse = torch.sqrt(mse)
mae = torch.mean(torch.abs(all_predictions - all_targets))

print(f"Test RMSE: {rmse.item():.4f}")
print(f"Test MSE: {mse.item():.4f}")
print(f"Test MAE: {mae.item():.4f}")
torch.Size([26, 15])
Test RMSE: 0.5059
Test MSE: 0.2559
Test MAE: 0.4495
In [269]:
# Multi Features
with torch.no_grad():  # Disable gradient calculation for evaluation
    for i, (x, y) in enumerate(val_data_loader_multi): 

        x, y = x.to(device), y.to(device)

        # Perform forward pass
        predictions = model(x)  # Use the output of the last time step
        for j in range(10):
            history = x[j].cpu().numpy()  # First sample in the batch
            true_future = y[j].cpu().numpy()  # Corresponding ground truth
            predicted_future = predictions[j].cpu().numpy()  # Model's prediction

            multi_step_plot(history, true_future, predicted_future, title=f"Sample {j + 1}")
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

After adding future_return_lag, the loss increased. LSTM itself can capture correlations in the data. The correlation between future_return and close price is linear and easier to capture, which may lead to data redundancy and increased noise. Therefore, variables related to future_return_lag are not used.

LSTM_3: Consider close prices and sentiment_score:¶

features_considered = ['Close','sentiment_score']

In [226]:
#Testing
model.eval
all_predictions = []
all_targets = []
# Iterate through the validation dataset
with torch.no_grad():  # Disable gradient calculation for evaluation
    for i, (x_batch, y__batch) in enumerate(val_data_loader_multi): 
        x = x_batch.to(device)
        y = y_batch.to(device)

        # Perform forward pass
        predictions = model(x_batch)
        
        all_predictions.append(predictions.cpu())
        all_targets.append(y_batch.cpu())
        # Print the shape of predictions
        print(predictions.shape)
        break
        
all_predictions = torch.cat(all_predictions, dim=0)
all_targets = torch.cat(all_targets, dim=0)

# Calculating MSE, RMSE, MAE
mse = torch.mean((all_predictions - all_targets) ** 2)
rmse = torch.sqrt(mse)
mae = torch.mean(torch.abs(all_predictions - all_targets))

print(f"Test RMSE: {rmse.item():.4f}")
print(f"Test MSE: {mse.item():.4f}")
print(f"Test MAE: {mae.item():.4f}")
torch.Size([26, 15])
Test RMSE: 0.2905
Test MSE: 0.0844
Test MAE: 0.2378
In [227]:
# Multi Features
with torch.no_grad():  # Disable gradient calculation for evaluation
    for i, (x, y) in enumerate(val_data_loader_multi): 

        x, y = x.to(device), y.to(device)

        # Perform forward pass
        predictions = model(x)  # Use the output of the last time step
        for j in range(10):
            history = x[j].cpu().numpy()  # First sample in the batch
            true_future = y[j].cpu().numpy()  # Corresponding ground truth
            predicted_future = predictions[j].cpu().numpy()  # Model's prediction

            multi_step_plot(history, true_future, predicted_future, title=f"Sample {j + 1}")
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

LSTM_4: Consider sentiment features:¶

features_considered = ['Close' ,'sentiment_score' , 'sentiment_volatility' ]

In [112]:
#Testing
model.eval
all_predictions = []
all_targets = []
# Iterate through the validation dataset
with torch.no_grad():  # Disable gradient calculation for evaluation
    for i, (x_batch, y__batch) in enumerate(val_data_loader_multi): 
        x = x_batch.to(device)
        y = y_batch.to(device)

        # Perform forward pass
        predictions = model(x_batch)
        
        all_predictions.append(predictions.cpu())
        all_targets.append(y_batch.cpu())
        # Print the shape of predictions
        print(predictions.shape)
        break
        
all_predictions = torch.cat(all_predictions, dim=0)
all_targets = torch.cat(all_targets, dim=0)

# Calculating MSE, RMSE, MAE
mse = torch.mean((all_predictions - all_targets) ** 2)
rmse = torch.sqrt(mse)
mae = torch.mean(torch.abs(all_predictions - all_targets))

print(f"Test RMSE: {rmse.item():.4f}")
print(f"Test MSE: {mse.item():.4f}")
print(f"Test MAE: {mae.item():.4f}")
torch.Size([26, 15])
Test RMSE: 0.2680
Test MSE: 0.0718
Test MAE: 0.2206
In [113]:
# Multi Features
with torch.no_grad():  # Disable gradient calculation for evaluation
    for i, (x, y) in enumerate(val_data_loader_multi): 

        x, y = x.to(device), y.to(device)

        # Perform forward pass
        predictions = model(x)  # Use the output of the last time step
        for j in range(10):
            history = x[j].cpu().numpy()  # First sample in the batch
            true_future = y[j].cpu().numpy()  # Corresponding ground truth
            predicted_future = predictions[j].cpu().numpy()  # Model's prediction

            multi_step_plot(history, true_future, predicted_future, title=f"Sample {j + 1}")
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

LSTM_5: Consider only index:¶

features_considered = ['Close' , 'S&P500','Nasdaq_Index']

In [976]:
#Testing
model.eval
all_predictions = []
all_targets = []
# Iterate through the validation dataset
with torch.no_grad():  # Disable gradient calculation for evaluation
    for i, (x_batch, y__batch) in enumerate(val_data_loader_multi): 
        x = x_batch.to(device)
        y = y_batch.to(device)

        # Perform forward pass
        predictions = model(x_batch)
        
        all_predictions.append(predictions.cpu())
        all_targets.append(y_batch.cpu())
        # Print the shape of predictions
        print(predictions.shape)
        break
        
all_predictions = torch.cat(all_predictions, dim=0)
all_targets = torch.cat(all_targets, dim=0)

# Calculating MSE, RMSE, MAE
mse = torch.mean((all_predictions - all_targets) ** 2)
rmse = torch.sqrt(mse)
mae = torch.mean(torch.abs(all_predictions - all_targets))

print(f"Test RMSE: {rmse.item():.4f}")
print(f"Test MSE: {mse.item():.4f}")
print(f"Test MAE: {mae.item():.4f}")
torch.Size([26, 15])
Test RMSE: 0.5512
Test MSE: 0.3038
Test MAE: 0.4635
In [978]:
# Multi Features
with torch.no_grad():  # Disable gradient calculation for evaluation
    for i, (x, y) in enumerate(val_data_loader_multi): 

        x, y = x.to(device), y.to(device)

        # Perform forward pass
        predictions = model(x)  # Use the output of the last time step
        for j in range(10):
            history = x[j].cpu().numpy()  # First sample in the batch
            true_future = y[j].cpu().numpy()  # Corresponding ground truth
            predicted_future = predictions[j].cpu().numpy()  # Model's prediction

            multi_step_plot(history, true_future, predicted_future, title=f"Sample {j + 1}")
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

LSTM_6: Consider all the features:¶

features_considered = ['Close' ,'sentiment_score' , 'sentiment_volatility' , 'S&P500','Nasdaq_Index']

In [214]:
#Testing
model.eval
all_predictions = []
all_targets = []
# Iterate through the validation dataset
with torch.no_grad():  # Disable gradient calculation for evaluation
    for i, (x_batch, y__batch) in enumerate(val_data_loader_multi): 
        x = x_batch.to(device)
        y = y_batch.to(device)

        # Perform forward pass
        predictions = model(x_batch)
        
        all_predictions.append(predictions.cpu())
        all_targets.append(y_batch.cpu())
        # Print the shape of predictions
        print(predictions.shape)
        break
        
all_predictions = torch.cat(all_predictions, dim=0)
all_targets = torch.cat(all_targets, dim=0)

# Calculating MSE, RMSE, MAE
mse = torch.mean((all_predictions - all_targets) ** 2)
rmse = torch.sqrt(mse)
mae = torch.mean(torch.abs(all_predictions - all_targets))

print(f"Test RMSE: {rmse.item():.4f}")
print(f"Test MSE: {mse.item():.4f}")
print(f"Test MAE: {mae.item():.4f}")
torch.Size([26, 15])
Test RMSE: 0.3022
Test MSE: 0.0913
Test MAE: 0.2491
In [215]:
# Multi Features
with torch.no_grad():  # Disable gradient calculation for evaluation
    for i, (x, y) in enumerate(val_data_loader_multi): 

        x, y = x.to(device), y.to(device)

        # Perform forward pass
        predictions = model(x)  # Use the output of the last time step
        for j in range(10):
            history = x[j].cpu().numpy()  # First sample in the batch
            true_future = y[j].cpu().numpy()  # Corresponding ground truth
            predicted_future = predictions[j].cpu().numpy()  # Model's prediction

            multi_step_plot(history, true_future, predicted_future, title=f"Sample {j + 1}")
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image